# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Read the trip records, then preview the first rows and summary statistics
df = pd.read_csv('data.csv')
df.head()
df.describe()
# Parse both trip timestamps as datetimes and turn the bike-share flag into a
# boolean that is True where the raw value is 'Yes'
for time_column in ('start_time', 'end_time'):
    df[time_column] = pd.to_datetime(df[time_column])
df['bike_share_for_all_trip'] = df['bike_share_for_all_trip'].eq('Yes')
df.info()
# Count missing values per column, drop incomplete rows, and re-check
df.isnull().sum()
df.dropna(inplace=True)
df.isnull().sum()
print(df.shape)
The dataset consists of 183412 records and 16 columns. Out of these 16 columns, nine are numerical, two are datetime, four are object and one is boolean type. The columns can be grouped into four main categories
I would like to investigate which factors affect trip duration.
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
## Derive two helper columns from the trip start time for the investigation:
## the weekday name (ordered categorical, Monday first) and the hour of day
df['start_time_weekday'] = pd.Categorical(
    df['start_time'].dt.day_name(),
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True,
)
df['start_time_hour'] = df['start_time'].dt.hour
df.head()
def get_bins(column, max_value: int = None, min_value: int = None, number_of_bins: int = 20):
    """
    Split a data range into equal-width histogram bin edges.

    Args:
        column: pandas Series; its max()/min() supply any bound not given
        max_value: upper edge of the last bin, defaults to column.max()
        min_value: lower edge of the first bin, defaults to column.min()
        number_of_bins: required number of bins, default = 20
    Returns:
        numpy array holding the number_of_bins + 1 evenly spaced bin edges
    Raises:
        ValueError: if number_of_bins is smaller than 1
    """
    if number_of_bins < 1:
        raise ValueError("number_of_bins must be at least 1")
    # Compare against None so that an explicit bound of 0 is honoured
    if max_value is None:
        max_value = column.max()
    if min_value is None:
        min_value = column.min()
    # linspace always returns exactly number_of_bins + 1 edges ending on
    # max_value; arange with a float step can emit one edge too many
    return np.linspace(min_value, max_value, number_of_bins + 1)
def get_column_key_value(column):
    """
    Count how often each distinct value occurs in a pandas Series.

    Args:
        column: pandas Series
    Returns:
        tuple (keys, counts): the index of distinct values and the
        value_counts() Series they belong to, in the same order
    """
    # Compute the counts once and reuse them for both tuple elements
    counts = column.value_counts()
    return counts.keys(), counts
# Histogram of trip duration on a linear scale
plt.hist(data=df, x='duration_sec', bins=get_bins(df.duration_sec))
# duration_sec holds seconds, so report the maximum in seconds (not ms)
print(f"Max Duration = {df.duration_sec.max()} sec")
# Trip duration again on a log-scaled x axis, with bins capped at 5000
bins = get_bins(df.duration_sec, max_value=5e3, number_of_bins=1000)
plt.hist(data=df, x='duration_sec', bins=bins)
plt.xscale('log')
plt.xticks([100, 200, 500, 1e3, 2e3, 5e3], [100, 200, 500, '1k', '2k', '5k']);
# Keep trips shorter than 2000 s. Take an explicit copy so the column added
# below is written to an independent frame instead of a slice of the
# original (this is what triggers pandas' SettingWithCopyWarning).
df = df[df.duration_sec < 2e3].copy()
# Rider age, computed as 2019 minus the birth year
df['age'] = (2019 - df.member_birth_year).astype(int)
plt.hist(df.age, bins=get_bins(df.age, number_of_bins=10));
# Keep riders younger than 70 and re-plot the age distribution
df = df[df.age < 70].copy()
plt.hist(df.age, bins=get_bins(df.age));
df.columns
# Histograms of the start and end station ids
df.start_station_id.hist(bins=100);
df.end_station_id.hist(bins=100);
# Bar plot of the number of trips per user type
x, y = get_column_key_value(df.user_type)
sb.barplot(x=x, y=y);
# Bar plot of the number of trips per member gender
x, y = get_column_key_value(df.member_gender)
sb.barplot(x=x, y=y);
# Bar plot of the number of trips per weekday of the start time
x, y = get_column_key_value(df.start_time_weekday)
sb.barplot(x=x, y=y);
plt.xticks(rotation = 45);
# Bar plot of the number of trips per start hour; use the derived
# start_time_hour column so this plot matches the other hour-based plots
x, y = get_column_key_value(df.start_time_hour)
sb.barplot(x=x, y=y);
The trip duration spans a very wide range of values, most of which are concentrated at the lower end, so I used a log scale to get a clearer visualization and then removed the outliers.
I subtracted member_birth_year from 2019 to get each member's age.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
# Age against trip duration, drawn first as a scatter plot and then as a heat map
# Scatter plot with small, semi-transparent markers
plt.scatter(x=df['age'], y=df['duration_sec'], marker='.', alpha=0.2)
plt.title('Scatter plot for Trip duration vs Age')
plt.xlabel('Age')
plt.ylabel('Trip Duration (sec)')
plt.show()
# Heat map (2-D histogram)
# Bin edges for each axis come from the shared get_bins helper
bins_x = get_bins(df['age'])
bins_y = get_bins(df['duration_sec'])
plt.hist2d(data=df, x='age', y='duration_sec', bins=[bins_x, bins_y])
plt.colorbar()
plt.title('Heatmap for Trip duration vs Age')
plt.xlabel('Age')
plt.ylabel('Trip Duration (sec)')
plt.show()
# Trip-duration histogram coloured by member gender
sb.displot(data=df, x='duration_sec', hue='member_gender', bins=get_bins(df.duration_sec))
## For member gender and then user type, draw a violin plot followed by a
## box plot of trip duration in sec
for category in ('member_gender', 'user_type'):
    sb.violinplot(data=df, x=category, y='duration_sec')
    plt.show()
    sb.boxplot(data=df, x=category, y='duration_sec')
# Facet grid with one trip-duration histogram per weekday, four panels per row
g = sb.FacetGrid(data=df, col='start_time_weekday', col_wrap=4).map(sb.histplot, 'duration_sec')
# Facet grid with one trip-duration histogram per start hour, six panels per row
g = sb.FacetGrid(data=df, col='start_time_hour', col_wrap=6).map(sb.histplot, 'duration_sec')
## Plot the total trip duration per start station id.
## Select duration_sec before summing: summing the whole frame would also try
## to sum the datetime, categorical and string columns, which fails on recent
## pandas versions and is wasted work on older ones.
df.groupby('start_station_id')['duration_sec'].sum().reset_index().plot(x='start_station_id', y='duration_sec');
## Plot the total trip duration per end station id
df.groupby('end_station_id')['duration_sec'].sum().reset_index().plot(x='end_station_id', y='duration_sec');
The trip duration is strongly associated with age, start/end station, and user_type, and of course the trip hour
Interestingly, females have a higher trip duration than males. Weekends also have higher trip durations, perhaps because users ride for fun rather than to reach a pre-determined destination.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
## One figure with two side-by-side panels (1 row, 2 columns)
fig, ax = plt.subplots(nrows=1, ncols=2)
## Left panel: point plot of trip duration vs member gender, split by user
## type, with the markers dodged and no connecting lines
sb.pointplot(
    data=df, x='member_gender', y='duration_sec', hue='user_type',
    dodge=0.3, linestyles="", ax=ax[0],
)
## Right panel: bar plot of trip duration vs member gender, split by user type
sb.barplot(data=df, x='member_gender', y='duration_sec', hue='user_type', ax=ax[1])
ax[1].legend()
## Facet grids of trip duration vs age, split in turn by gender, by weekday
## and by start hour; each entry is (facet column, panels per row, marker alpha)
for facet_column, panels_per_row, marker_alpha in (
    ('member_gender', 2, 0.01),
    ('start_time_weekday', 3, 0.01),
    ('start_time_hour', 4, 0.1),
):
    g = sb.FacetGrid(data=df, col=facet_column, col_wrap=panels_per_row, height=5)
    g.map(plt.scatter, 'age', 'duration_sec', alpha=marker_alpha)
We observed that customers take longer trips than subscribers. Although females take longer trips than males overall, older males take longer trips than females. Most Ford GoBike users are of working age, and they probably make their trips on weekdays.
It is nice to see older people do morning bike rides.
At the end of your report, make sure that you export the notebook as an html file from the
File > Download as... > HTML menu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!